Index

Alignmnent and counting of the fastq files

This step is performed on the O2 cluster. The fastq file quality was checked using FastQC and MultiQC. They are aligned to Ensembl mm10 genome and counted using Salmon pseudoaligner. Output sf files were transfered from O2 to local machine for further processing in R.

Library loading and set up

suppressMessages(
  c(library(DESeq2),
    library(limma),
    library(tximport),
    library(gridExtra),
    library(ensembldb),
    library(EnsDb.Mmusculus.v79),
    library(grid),
    library(ggplot2),
    library(lattice),
    library(reshape),
    library(mixOmics),
    library(gplots),
    library(RColorBrewer),
    library(readr),
    library(dplyr),
    library(VennDiagram),
    library(clusterProfiler),
    library(DOSE),
    library(org.Mm.eg.db), 
    library(pathview),
    library(AnnotationDbi),
    library(knitr))
)
##   [1] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
##   [4] "BiocParallel"         "matrixStats"          "Biobase"             
##   [7] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
##  [10] "S4Vectors"            "BiocGenerics"         "parallel"            
##  [13] "stats4"               "stats"                "graphics"            
##  [16] "grDevices"            "utils"                "datasets"            
##  [19] "methods"              "base"                 "limma"               
##  [22] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
##  [25] "BiocParallel"         "matrixStats"          "Biobase"             
##  [28] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
##  [31] "S4Vectors"            "BiocGenerics"         "parallel"            
##  [34] "stats4"               "stats"                "graphics"            
##  [37] "grDevices"            "utils"                "datasets"            
##  [40] "methods"              "base"                 "tximport"            
##  [43] "limma"                "DESeq2"               "SummarizedExperiment"
##  [46] "DelayedArray"         "BiocParallel"         "matrixStats"         
##  [49] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
##  [52] "IRanges"              "S4Vectors"            "BiocGenerics"        
##  [55] "parallel"             "stats4"               "stats"               
##  [58] "graphics"             "grDevices"            "utils"               
##  [61] "datasets"             "methods"              "base"                
##  [64] "gridExtra"            "tximport"             "limma"               
##  [67] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
##  [70] "BiocParallel"         "matrixStats"          "Biobase"             
##  [73] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
##  [76] "S4Vectors"            "BiocGenerics"         "parallel"            
##  [79] "stats4"               "stats"                "graphics"            
##  [82] "grDevices"            "utils"                "datasets"            
##  [85] "methods"              "base"                 "ensembldb"           
##  [88] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
##  [91] "gridExtra"            "tximport"             "limma"               
##  [94] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
##  [97] "BiocParallel"         "matrixStats"          "Biobase"             
## [100] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [103] "S4Vectors"            "BiocGenerics"         "parallel"            
## [106] "stats4"               "stats"                "graphics"            
## [109] "grDevices"            "utils"                "datasets"            
## [112] "methods"              "base"                 "EnsDb.Mmusculus.v79" 
## [115] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
## [118] "AnnotationDbi"        "gridExtra"            "tximport"            
## [121] "limma"                "DESeq2"               "SummarizedExperiment"
## [124] "DelayedArray"         "BiocParallel"         "matrixStats"         
## [127] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [130] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [133] "parallel"             "stats4"               "stats"               
## [136] "graphics"             "grDevices"            "utils"               
## [139] "datasets"             "methods"              "base"                
## [142] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [145] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [148] "gridExtra"            "tximport"             "limma"               
## [151] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [154] "BiocParallel"         "matrixStats"          "Biobase"             
## [157] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [160] "S4Vectors"            "BiocGenerics"         "parallel"            
## [163] "stats4"               "stats"                "graphics"            
## [166] "grDevices"            "utils"                "datasets"            
## [169] "methods"              "base"                 "ggplot2"             
## [172] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [175] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [178] "gridExtra"            "tximport"             "limma"               
## [181] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [184] "BiocParallel"         "matrixStats"          "Biobase"             
## [187] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [190] "S4Vectors"            "BiocGenerics"         "parallel"            
## [193] "stats4"               "stats"                "graphics"            
## [196] "grDevices"            "utils"                "datasets"            
## [199] "methods"              "base"                 "lattice"             
## [202] "ggplot2"              "grid"                 "EnsDb.Mmusculus.v79" 
## [205] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
## [208] "AnnotationDbi"        "gridExtra"            "tximport"            
## [211] "limma"                "DESeq2"               "SummarizedExperiment"
## [214] "DelayedArray"         "BiocParallel"         "matrixStats"         
## [217] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [220] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [223] "parallel"             "stats4"               "stats"               
## [226] "graphics"             "grDevices"            "utils"               
## [229] "datasets"             "methods"              "base"                
## [232] "reshape"              "lattice"              "ggplot2"             
## [235] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [238] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [241] "gridExtra"            "tximport"             "limma"               
## [244] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [247] "BiocParallel"         "matrixStats"          "Biobase"             
## [250] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [253] "S4Vectors"            "BiocGenerics"         "parallel"            
## [256] "stats4"               "stats"                "graphics"            
## [259] "grDevices"            "utils"                "datasets"            
## [262] "methods"              "base"                 "mixOmics"            
## [265] "MASS"                 "reshape"              "lattice"             
## [268] "ggplot2"              "grid"                 "EnsDb.Mmusculus.v79" 
## [271] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
## [274] "AnnotationDbi"        "gridExtra"            "tximport"            
## [277] "limma"                "DESeq2"               "SummarizedExperiment"
## [280] "DelayedArray"         "BiocParallel"         "matrixStats"         
## [283] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [286] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [289] "parallel"             "stats4"               "stats"               
## [292] "graphics"             "grDevices"            "utils"               
## [295] "datasets"             "methods"              "base"                
## [298] "gplots"               "mixOmics"             "MASS"                
## [301] "reshape"              "lattice"              "ggplot2"             
## [304] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [307] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [310] "gridExtra"            "tximport"             "limma"               
## [313] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [316] "BiocParallel"         "matrixStats"          "Biobase"             
## [319] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [322] "S4Vectors"            "BiocGenerics"         "parallel"            
## [325] "stats4"               "stats"                "graphics"            
## [328] "grDevices"            "utils"                "datasets"            
## [331] "methods"              "base"                 "RColorBrewer"        
## [334] "gplots"               "mixOmics"             "MASS"                
## [337] "reshape"              "lattice"              "ggplot2"             
## [340] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [343] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [346] "gridExtra"            "tximport"             "limma"               
## [349] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [352] "BiocParallel"         "matrixStats"          "Biobase"             
## [355] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [358] "S4Vectors"            "BiocGenerics"         "parallel"            
## [361] "stats4"               "stats"                "graphics"            
## [364] "grDevices"            "utils"                "datasets"            
## [367] "methods"              "base"                 "readr"               
## [370] "RColorBrewer"         "gplots"               "mixOmics"            
## [373] "MASS"                 "reshape"              "lattice"             
## [376] "ggplot2"              "grid"                 "EnsDb.Mmusculus.v79" 
## [379] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
## [382] "AnnotationDbi"        "gridExtra"            "tximport"            
## [385] "limma"                "DESeq2"               "SummarizedExperiment"
## [388] "DelayedArray"         "BiocParallel"         "matrixStats"         
## [391] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [394] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [397] "parallel"             "stats4"               "stats"               
## [400] "graphics"             "grDevices"            "utils"               
## [403] "datasets"             "methods"              "base"                
## [406] "dplyr"                "readr"                "RColorBrewer"        
## [409] "gplots"               "mixOmics"             "MASS"                
## [412] "reshape"              "lattice"              "ggplot2"             
## [415] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [418] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [421] "gridExtra"            "tximport"             "limma"               
## [424] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [427] "BiocParallel"         "matrixStats"          "Biobase"             
## [430] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [433] "S4Vectors"            "BiocGenerics"         "parallel"            
## [436] "stats4"               "stats"                "graphics"            
## [439] "grDevices"            "utils"                "datasets"            
## [442] "methods"              "base"                 "VennDiagram"         
## [445] "futile.logger"        "dplyr"                "readr"               
## [448] "RColorBrewer"         "gplots"               "mixOmics"            
## [451] "MASS"                 "reshape"              "lattice"             
## [454] "ggplot2"              "grid"                 "EnsDb.Mmusculus.v79" 
## [457] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
## [460] "AnnotationDbi"        "gridExtra"            "tximport"            
## [463] "limma"                "DESeq2"               "SummarizedExperiment"
## [466] "DelayedArray"         "BiocParallel"         "matrixStats"         
## [469] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [472] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [475] "parallel"             "stats4"               "stats"               
## [478] "graphics"             "grDevices"            "utils"               
## [481] "datasets"             "methods"              "base"                
## [484] "clusterProfiler"      "VennDiagram"          "futile.logger"       
## [487] "dplyr"                "readr"                "RColorBrewer"        
## [490] "gplots"               "mixOmics"             "MASS"                
## [493] "reshape"              "lattice"              "ggplot2"             
## [496] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [499] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [502] "gridExtra"            "tximport"             "limma"               
## [505] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [508] "BiocParallel"         "matrixStats"          "Biobase"             
## [511] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [514] "S4Vectors"            "BiocGenerics"         "parallel"            
## [517] "stats4"               "stats"                "graphics"            
## [520] "grDevices"            "utils"                "datasets"            
## [523] "methods"              "base"                 "DOSE"                
## [526] "clusterProfiler"      "VennDiagram"          "futile.logger"       
## [529] "dplyr"                "readr"                "RColorBrewer"        
## [532] "gplots"               "mixOmics"             "MASS"                
## [535] "reshape"              "lattice"              "ggplot2"             
## [538] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [541] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [544] "gridExtra"            "tximport"             "limma"               
## [547] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [550] "BiocParallel"         "matrixStats"          "Biobase"             
## [553] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [556] "S4Vectors"            "BiocGenerics"         "parallel"            
## [559] "stats4"               "stats"                "graphics"            
## [562] "grDevices"            "utils"                "datasets"            
## [565] "methods"              "base"                 "org.Mm.eg.db"        
## [568] "DOSE"                 "clusterProfiler"      "VennDiagram"         
## [571] "futile.logger"        "dplyr"                "readr"               
## [574] "RColorBrewer"         "gplots"               "mixOmics"            
## [577] "MASS"                 "reshape"              "lattice"             
## [580] "ggplot2"              "grid"                 "EnsDb.Mmusculus.v79" 
## [583] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
## [586] "AnnotationDbi"        "gridExtra"            "tximport"            
## [589] "limma"                "DESeq2"               "SummarizedExperiment"
## [592] "DelayedArray"         "BiocParallel"         "matrixStats"         
## [595] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [598] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [601] "parallel"             "stats4"               "stats"               
## [604] "graphics"             "grDevices"            "utils"               
## [607] "datasets"             "methods"              "base"                
## [610] "pathview"             "org.Hs.eg.db"         "org.Mm.eg.db"        
## [613] "DOSE"                 "clusterProfiler"      "VennDiagram"         
## [616] "futile.logger"        "dplyr"                "readr"               
## [619] "RColorBrewer"         "gplots"               "mixOmics"            
## [622] "MASS"                 "reshape"              "lattice"             
## [625] "ggplot2"              "grid"                 "EnsDb.Mmusculus.v79" 
## [628] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
## [631] "AnnotationDbi"        "gridExtra"            "tximport"            
## [634] "limma"                "DESeq2"               "SummarizedExperiment"
## [637] "DelayedArray"         "BiocParallel"         "matrixStats"         
## [640] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [643] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [646] "parallel"             "stats4"               "stats"               
## [649] "graphics"             "grDevices"            "utils"               
## [652] "datasets"             "methods"              "base"                
## [655] "pathview"             "org.Hs.eg.db"         "org.Mm.eg.db"        
## [658] "DOSE"                 "clusterProfiler"      "VennDiagram"         
## [661] "futile.logger"        "dplyr"                "readr"               
## [664] "RColorBrewer"         "gplots"               "mixOmics"            
## [667] "MASS"                 "reshape"              "lattice"             
## [670] "ggplot2"              "grid"                 "EnsDb.Mmusculus.v79" 
## [673] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
## [676] "AnnotationDbi"        "gridExtra"            "tximport"            
## [679] "limma"                "DESeq2"               "SummarizedExperiment"
## [682] "DelayedArray"         "BiocParallel"         "matrixStats"         
## [685] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [688] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [691] "parallel"             "stats4"               "stats"               
## [694] "graphics"             "grDevices"            "utils"               
## [697] "datasets"             "methods"              "base"                
## [700] "knitr"                "pathview"             "org.Hs.eg.db"        
## [703] "org.Mm.eg.db"         "DOSE"                 "clusterProfiler"     
## [706] "VennDiagram"          "futile.logger"        "dplyr"               
## [709] "readr"                "RColorBrewer"         "gplots"              
## [712] "mixOmics"             "MASS"                 "reshape"             
## [715] "lattice"              "ggplot2"              "grid"                
## [718] "EnsDb.Mmusculus.v79"  "ensembldb"            "AnnotationFilter"    
## [721] "GenomicFeatures"      "AnnotationDbi"        "gridExtra"           
## [724] "tximport"             "limma"                "DESeq2"              
## [727] "SummarizedExperiment" "DelayedArray"         "BiocParallel"        
## [730] "matrixStats"          "Biobase"              "GenomicRanges"       
## [733] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [736] "BiocGenerics"         "parallel"             "stats4"              
## [739] "stats"                "graphics"             "grDevices"           
## [742] "utils"                "datasets"             "methods"             
## [745] "base"

Analysis of all samples

Compile gene count files in DESeq2

Set working directory to the folder that contains only gene count txt files

# Generate a tx2gene object for matrix generation
edb <- EnsDb.Mmusculus.v79 
transcriptsID <- as.data.frame(transcripts(edb))
tx2gene <- as.data.frame(cbind(transcriptsID$tx_id, transcriptsID$gene_id))

# Generate DESeqData using the counting result generated using Salmon
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Gene Counts/TNF")
inDir = getwd()
countFiles = list.files(inDir, pattern=".sf$", full.names = TRUE)
basename(countFiles)
##  [1] "I_MKI1_3351_LIB038588_GEN00143067_R1.sf" 
##  [2] "I_MKI2_3638_LIB041682_GEN00156310_R1.sf" 
##  [3] "I_MKI3_3623_LIB041682_GEN00156311_R1.sf" 
##  [4] "I_MKI4_3838_LIB041682_GEN00156313_R1.sf" 
##  [5] "I_MKI5_3839_LIB041682_GEN00156314_R1.sf" 
##  [6] "I_MKI6_3841_LIB041682_GEN00156315_R1.sf" 
##  [7] "I_V1_3373_LIB038588_GEN00143070_R1.sf"   
##  [8] "I_V2_3378_LIB038588_GEN00143074_R1.sf"   
##  [9] "I_V3_3994_LIB038588_GEN00143084_R1.sf"   
## [10] "I_V4_3996_LIB038588_GEN00143085_R1.sf"   
## [11] "I_V5_3624_LIB041682_GEN00156312_R1.sf"   
## [12] "NI_MKI1_3352_LIB038588_GEN00143068_R1.sf"
## [13] "NI_MKI2_3906_LIB038588_GEN00143076_R1.sf"
## [14] "NI_MKI3_3925_LIB038588_GEN00143080_R1.sf"
## [15] "NI_MKI4_3946_LIB038588_GEN00143082_R1.sf"
## [16] "NI_MKI5_3374_LIB038588_GEN00143071_R1.sf"
## [17] "NI_V1_3376_LIB038588_GEN00143072_R1.sf"  
## [18] "NI_V2_3377_LIB038588_GEN00143073_R1.sf"  
## [19] "NI_V3_3904_LIB038588_GEN00143075_R1.sf"  
## [20] "NI_V4_3922_LIB038588_GEN00143078_R1.sf"  
## [21] "NI_V5_3924_LIB038588_GEN00143079_R1.sf"
names(countFiles) <- c('I_MKI_1','I_MKI_2','I_MKI_3','I_MKI_4','I_MKI_5','I_MKI_6', 'I_V_1', 'I_V_2', 'I_V_3', 'I_V_4', 'I_V_5', 'NI_MKI_1','NI_MKI_2','NI_MKI_3','NI_MKI_4', 'NI_MKI_5', 'NI_V_1', 'NI_V_2', 'NI_V_3', 'NI_V_4', 'NI_V_5')

txi.salmon <- tximport(countFiles, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable_all <- data.frame(condition = c(rep('experimental', 11), rep('control', 10)))

DESeqsampletable_all$batch <- factor(c('2','3','3','3','3','3','2','2','2','2','3','2','2','2','2','2','2','2','2','2','2'))
DESeqsampletable_all$gender <- factor(c('M', 'F', 'F', 'F', 'M', 'F', 'F', 'M', 'M', 'F', 'F', 'M', 'M', 'M', 'M', 'F', 'F', 'F', 'M', 'M', 'M'))

rownames(DESeqsampletable_all) <- colnames(txi.salmon$counts)

ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable_all, ~ condition + batch + gender)
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- estimateSizeFactors(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
ddsHTSeq_norm <- DESeq(ddsHTSeq_norm)
## using pre-existing normalization factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 73 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis)
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895

MA plot was generated to inspect the correct shrinkage of LFC.

DESeq2::plotMA(ddsHTSeq_analysis)

Quality Inspection of the Gene Count Data

Generate raw count table that contains the simple counts of each gene

Data is transformed and pseudocount is calculated.

rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
grid.arrange(
  ggplot(pseudoCount, aes(x= I_MKI_1)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_1"), 
  ggplot(pseudoCount, aes(x= I_MKI_2)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_2"),
  ggplot(pseudoCount, aes(x= I_MKI_3)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_3"),
  ggplot(pseudoCount, aes(x= I_MKI_4)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_4"),
  ggplot(pseudoCount, aes(x= I_MKI_5)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_5"),
  ggplot(pseudoCount, aes(x= I_MKI_6)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_6"), nrow = 2)

grid.arrange(
  ggplot(pseudoCount, aes(x= I_V_1)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_1"),
  ggplot(pseudoCount, aes(x= I_V_2)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_2"),
  ggplot(pseudoCount, aes(x= I_V_3)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_3"),
  ggplot(pseudoCount, aes(x= I_V_4)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_4"),
  ggplot(pseudoCount, aes(x= I_V_5)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_5"), nrow = 2)

grid.arrange(
  ggplot(pseudoCount, aes(x= NI_MKI_1)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_1"),
  ggplot(pseudoCount, aes(x= NI_MKI_2)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_2"),
  ggplot(pseudoCount, aes(x= NI_MKI_3)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_3"),
  ggplot(pseudoCount, aes(x= NI_MKI_4)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_4"),
  ggplot(pseudoCount, aes(x= NI_MKI_5)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_5"), nrow = 2)

grid.arrange(
  ggplot(pseudoCount, aes(x= NI_V_1)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_1"),
  ggplot(pseudoCount, aes(x= NI_V_2)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_2"),
  ggplot(pseudoCount, aes(x= NI_V_3)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_3"),
  ggplot(pseudoCount, aes(x= NI_V_4)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_4"),
  ggplot(pseudoCount, aes(x= NI_V_5)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_5"), nrow = 2)

Between-sample distribution

Check on the gene count distribution across all genes.

#Boxplots
suppressMessages(df <- melt(pseudoCount, variable_name = "Samples"))
df <- data.frame(df, Condition = substr(df$Samples,1,4))

ggplot(df, aes(x=Samples, y=value, fill = Condition)) + geom_boxplot() + xlab("") + 
  ylab(expression(log[2](count+1))) + scale_fill_manual(values = c("#619CFF", "#F564E3", "#E69F00", "#FF0000")) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

#Histograms and density plots
ggplot(df, aes(x=value, colour = Samples, fill = Samples)) + ylim(c(0, 0.25)) + 
  geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
  theme(legend.position = "top") + xlab(expression(log[2](count+1)))

Clustering of the sample-to-sample distances

This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf

keep <- c()
for (i in 1:dim(rawCountTable)[1]) {
  if (sum(rawCountTable[i,1:6] != 0) >=2 | sum(rawCountTable[i,7:11] != 0) >= 2 | sum(rawCountTable[i,12:16] != 0) >= 2 | sum(rawCountTable[i,17:21] != 0) >= 2) {
    keep <- c(keep, i)
  }
}
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
rawCountTable_transform_detected_TNF <- rawCountTable_transform[keep,]
write.csv(rawCountTable_transform_detected_TNF, "VST_TNF.csv")

pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TNF")
png('Hierchical_Clustering.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2
pdf('Hierchical_Clustering.pdf')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2
include_graphics('Hierchical_Clustering.png')

Principal component plot of the samples

I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.

The top 500 most variable genes are selected for PCA analysis.

plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500) +
  geom_text(aes(label=name), vjust = 2) + 
  xlim(-30, 35) + ylim(-20, 20)

I_V vs NI_V

Compile gene count files in DESeq2

Set working directory to the folder that contains only gene count txt files

# Generate DESeqData using the counting result generated using Salmon
countFiles_vcompare <- c(countFiles[17:21], countFiles[7:11])

txi.salmon <- tximport(countFiles_vcompare, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable <- DESeqsampletable_all[c(17:21, 7:11), ]

ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable, ~ condition + batch + gender)
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- DESeq(ddsHTSeq)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis)
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895

MA plot was generated to inspect the correct shrinkage of LFC.

DESeq2::plotMA(ddsHTSeq_analysis)

Clustering of the sample-to-sample distances

This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf

rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TNF")
png('Hierchical_Clustering_I_V vs NI_V.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2
pdf('Hierchical_Clustering_I_V vs NI_V.pdf')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2
include_graphics('Hierchical_Clustering_I_V vs NI_V.png')

Final output is following: Hierchical Clustering

Principal component plot of the samples

I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.

The top 500 most variable genes are selected for PCA analysis.

(PCA <- plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500))

pdf('PCA_TNF_IV-vs-NIV.pdf',
    height = 6,
    width = 8)
PCA
dev.off()
## quartz_off_screen 
##                 2

Differential analysis

Raw data filtering and Generate the raw count file with all detected genes

This step removes all genes with 0 counts across all samples, output a csv file and also generate a density plot using filtered dataset.

keep <- c()
for (i in 1:dim(rawCountTable)[1]) {
  if (sum(rawCountTable[i,1:5] != 0) >=2 | sum(rawCountTable[i,6:10] != 0) >= 2) {
    keep <- c(keep, i)
  }
}
filterCount <- pseudoCount[keep,]
df <- melt(filterCount, variable_name = "Samples")
## Using  as id variables
df <- data.frame(df, Condition = substr(df$Samples,1,4))
detected_raw_count_norm <- rawCountTable[keep,]
write.csv(detected_raw_count_norm, "normalized_raw_gene_counts_I_V vs NI_V.csv")

ggplot(df, aes(x=value, colour = Samples, fill = Samples)) + 
  geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
  theme(legend.position = "top") + xlab("pseudocounts")

Generate file with differential analysis result

This step generates the analysis file that contains results from differential analysis.

write.csv(as.data.frame(ddsHTSeq_analysis[keep,]), "Differential Analysis_I_V vs NI_V.csv")

Draw heatmap for transcripts that are significantly dysregulated in I_V samples compared to NI_V samples

Genes that were not detected were removed from the list. Genes with padj < 0.05 were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.

suppressMessages(library(mosaic))

rawCountTable_transform_detected_iv_vs_niv <- rawCountTable_transform[keep,]
write.csv(rawCountTable_transform_detected_iv_vs_niv, "VST_I_V vs NI_V.csv")

dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.05 & abs(dif_analysis$log2FoldChange) > 0.25)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
  sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected_iv_vs_niv)))
}
sig_count <- rawCountTable_transform_detected_iv_vs_niv[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
  sig_dif[i,7:16] <- zscore(as.numeric(sig_dif[i,7:16]))
}

my_palette <- colorRampPalette(c("blue", "white", "red"))(128)
heatmap_matrix <- as.matrix(sig_dif[,7:16])

png('I_V vs NI_V RNASeq.png',
    width = 600,
    height = 1200,
    res = 200,
    pointsize = 8)
heatmap.2(heatmap_matrix,
          main = "I_V vs NI_V RNASeq\npadj < 0.1\nLFC > 0.25",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
pdf('I_V vs NI_V RNASeq.pdf',
    width = 6,
    height = 10)
heatmap.2(heatmap_matrix,
          main = "I_V vs NI_V RNASeq\npadj < 0.1\nLFC > 0.25",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
include_graphics('I_V vs NI_V RNASeq.png')

# output number of significant DE genes
dim(sig_dif)[1]
## [1] 1856
write.csv(sig_dif, "Significant genes_I_V vs NI_V.csv")

Scatter plot, MA plot and Volcano plot for data visualization

# Scatter plot
detected_pseudocount <- pseudoCount[keep,]
dif_analysis$I_V_mean <- rowMeans(detected_pseudocount[,6:10])
dif_analysis$NI_V_mean <- rowMeans(detected_pseudocount[,1:5])
(scatter <- ggplot(dif_analysis, aes(x = NI_V_mean, y = I_V_mean)) +
  xlab("NI_V_Average(log2)") + ylab("I_V_Average(log2)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
  labs(title = "I_V vs NI_V Scatter Plot"))

pdf('ScatterPlot_I_V vs NI_V.pdf')
scatter
dev.off()
## quartz_off_screen 
##                 2
# MA plot
(MA <- ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange,)) +
  xlab("Average Expression") + ylab("LFC") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_V vs NI_V MA Plot"))

pdf('MAPlot_I_V vs NI_V.pdf')
MA
dev.off()
## quartz_off_screen 
##                 2
# Volcano Plot
(volcano <- ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(pvalue,10))) +
  xlab("LFC") + ylab("-log10(P value)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_V vs NI_V Volcano Plot"))
## Warning: Removed 42 rows containing missing values (geom_point).

pdf('VolcanoPlot_I_V vs NI_V.pdf')
volcano
## Warning: Removed 42 rows containing missing values (geom_point).
dev.off()
## quartz_off_screen 
##                 2

GO analysis for DE genes

Classic GO analysis is performed here for all DE genes detected in this dataset. The reference list is list of genes detected in RNASeq. Three categories of GO terms are tested here, including biological process, molecular function and cellular component.

target_gene <- as.character(rownames(sig_dif))
detected_gene <- as.character(rownames(detected_pseudocount))

# Run GO enrichment analysis for biological process
ego_BP <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "BP", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_BP <- data.frame(ego_BP)

write.csv(cluster_summary_BP, "GO analysis/GO analysis_BP_I_V vs NI_V.csv")

# Run GO enrichment analysis for molecular function 
ego_MF <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "MF", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_MF <- data.frame(ego_MF)

write.csv(cluster_summary_MF, "GO analysis/GO analysis_MF_I_V vs NI_V.csv")

# Run GO enrichment analysis for cellular component 
ego_CC <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "CC", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_CC <- data.frame(ego_CC)

write.csv(cluster_summary_CC, "GO analysis/GO analysis_CC_I_V vs NI_V.csv")

Draw Dotplot representing the results

Biological process
png('GO analysis/GO dotplot_BP_I_V vs NI_V.png',
    width = 1600,
    height = 1600,
    res = 100,
    pointsize = 8)
dotplot(ego_BP, showCategory=50)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: BP_dotplot

Molecular Function
png('GO analysis/GO dotplot_MF_I_V vs NI_V.png',
    width = 1200,
    height = 1600,
    res = 100,
    pointsize = 8)
dotplot(ego_MF, showCategory=50)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: MF_dotplot

Cellular component
png('GO analysis/GO dotplot_CC_I_V vs NI_V.png',
    width = 1200,
    height = 1600,
    res = 100,
    pointsize = 8)
dotplot(ego_CC, showCategory=50)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: CC_dotplot #### Draw enrichment Go plot representing the results ##### Biological process

png('GO analysis/GO enrichment_BP_I_V vs NI_V.png',
    width = 1200,
    height = 1200,
    res = 100,
    pointsize = 8)
emapplot(ego_BP, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: BP_enrichplot ##### Molecular function

png('GO analysis/GO enrichment_MF_I_V vs NI_V.png',
    width = 1200,
    height = 1200,
    res = 100,
    pointsize = 8)
emapplot(ego_MF, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: BP_enrichplot ##### Cellular component

png('GO analysis/GO enrichment_CC_I_V vs NI_V.png',
    width = 1200,
    height = 1200,
    res = 100,
    pointsize = 8)
emapplot(ego_CC, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: BP_enrichplot #### Draw category netplot representing the results The category netplot shows the relationships between the genes associated with the top five most significant GO terms and the fold changes of the significant genes associated with these terms (color). ##### Biological process

OE_foldchanges <- sig_dif$log2FoldChange
names(OE_foldchanges) <- rownames(sig_dif)

png('GO analysis/GO cnetplot_BP_I_V vs NI_V.png',
    width = 1600,
    height = 1600,
    res = 100,
    pointsize = 8)
cnetplot(ego_BP, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE_foldchanges, 
         vertex.label.font=6)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: BP_cnetplot ##### Molecular function

png('GO analysis/GO cnetplot_MF_I_V vs NI_V.png',
    width = 1600,
    height = 1600,
    res = 100,
    pointsize = 8)
cnetplot(ego_MF, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE_foldchanges, 
         vertex.label.font=6)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: MF_cnetplot ##### Cellular component

png('GO analysis/GO cnetplot_CC_I_V vs NI_V.png',
    width = 1600,
    height = 1600,
    res = 100,
    pointsize = 8)
cnetplot(ego_CC, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE_foldchanges, 
         vertex.label.font=6)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: CC_cnetplot

I_MKI vs I_V

Compile gene count files in DESeq2

Set working directory to the folder that contains only gene count txt files

# Generate DESeqData using the counting result generated using Salmon
countFiles_icompare <- c(countFiles[7:11], countFiles[1:6])

txi.salmon <- tximport(countFiles_icompare, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable <- DESeqsampletable_all[c(7:11, 1:6), ]
DESeqsampletable$condition <- factor(c(rep('control', 5), rep('experimental', 6)))

ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable, ~ condition + batch + gender)
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- DESeq(ddsHTSeq)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis)
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895

MA plot was generated to inspect the correct shrinkage of LFC.

DESeq2::plotMA(ddsHTSeq_analysis)

Clustering of the sample-to-sample distances

This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf

rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TNF")
png('Hierchical_Clustering_I_MKI vs I_V.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2
pdf('Hierchical_Clustering_I_MKI vs I_V.pdf')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2
include_graphics('Hierchical_Clustering_I_MKI vs I_V.png')

Principal component plot of the samples

I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.

The top 500 most variable genes are selected for PCA analysis.

(PCA <- plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500))

pdf('PCA_TNF_IMKi-vs-IV.pdf',
    height = 6,
    width = 8)
PCA
dev.off()
## quartz_off_screen 
##                 2

Differential analysis

Raw data filtering and Generate the raw count file with all detected genes

This step removes all genes with less than 50 counts across all control or experimental samples, output a csv file and also generate a density plot using filtered dataset.

keep <- c()
for (i in 1:dim(rawCountTable)[1]) {
  if (sum(rawCountTable[i,1:5] != 0) >=2 | sum(rawCountTable[i,6:11] != 0) >= 2) {
    keep <- c(keep, i)
  }
}

filterCount <- pseudoCount[keep,]
df <- melt(filterCount, variable_name = "Samples")
## Using  as id variables
df <- data.frame(df, Condition = substr(df$Samples,1,4))
detected_raw_count_norm <- rawCountTable[keep,]
write.csv(detected_raw_count_norm, "normalized_raw_gene_counts_I_MKI vs I_V.csv")

ggplot(df, aes(x=value, colour = Samples, fill = Samples)) + 
  geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
  theme(legend.position = "top") + xlab("pseudocounts")

Generate file with differential analysis result

This step generates the analysis file that contains results from differential analysis.

write.csv(as.data.frame(ddsHTSeq_analysis[keep,]), "Differential Analysis_I_MKI vs I_V.csv")

Draw heatmap for transcripts that are significantly dysregulated in I_MKI samples compared to I_V samples

Genes that were not detected were removed from the list. Genes with padj < 0.1 (I decided to relax the padj cutoff here as well to keep it consistent with the TCT model analysis) were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.

suppressMessages(library(mosaic))

rawCountTable_transform_detected <- rawCountTable_transform[keep,]
write.csv(rawCountTable_transform_detected, "VST_I_MKI vs I_V.csv")


dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.1 & abs(dif_analysis$log2FoldChange) > 0.25)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
  sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected)))
}
sig_count <- rawCountTable_transform_detected[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
  sig_dif[i,7:17] <- zscore(as.numeric(sig_dif[i,7:17]))
}

heatmap_matrix <- as.matrix(sig_dif[,7:17])

png('I_MKI vs I_V RNASeq.png',
    width = 600,
    height = 1200,
    res = 200,
    pointsize = 8)
heatmap.2(heatmap_matrix,
          main = "I_MKI vs I_V RNASeq\npadj < 0.1\nLFC > 0.25",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
pdf('I_MKI vs I_V RNASeq.pdf',
    width = 6,
    height = 10)
heatmap.2(heatmap_matrix,
          main = "I_MKI vs I_V RNASeq\npadj < 0.1\nLFC > 0.25",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
include_graphics('I_MKI vs I_V RNASeq.png')

# output number of significant DE genes
dim(sig_dif)[1]
## [1] 18
write.csv(sig_dif, "Significant genes_I_MKI vs I_V.csv")

Draw heatmap of inflamed MK2i targets in NI_V vs I_V vs I_MK2i comparison of TNF and TCT

TNF

# rearrange the TCT master count matrix
rawCountTable_transform_detected_TNF <- read_csv("VST_TNF.csv", )
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   X1 = col_character()
## )
## See spec(...) for full column specifications.
row_names <- rawCountTable_transform_detected_TNF$X1
rawCountTable_transform_detected_TNF$X1 <- NULL
rawCountTable_transform_detected_reshape <- cbind(rawCountTable_transform_detected_TNF[,17:21], rawCountTable_transform_detected_TNF[,7:11], rawCountTable_transform_detected_TNF[,1:6])
rownames(rawCountTable_transform_detected_reshape) <- row_names

# plot the heatmap
dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.1 & abs(dif_analysis$log2FoldChange) > 0.25)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
  sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected_reshape)))
}
sig_count <- rawCountTable_transform_detected_reshape[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
  sig_dif[i,7:22] <- zscore(as.numeric(sig_dif[i,7:22]))
}

heatmap_matrix <- as.matrix(sig_dif[,7:22])

png('TNF MK2i targets in TNF NI_V vs I_V vs I_MK2i RNASeq.png',
    width = 600,
    height = 700,
    res = 100,
    pointsize = 8)
heatmap.2(heatmap_matrix,
          main = "TNF MK2i targets\nTNF I_V vs NI_V vs I_MK2i\npadj < 0.1 LFC > 0.25",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          lwid = c(1,5),
          col=my_palette,
          cexCol = 1,
          margins = c(8,3),
          labRow = FALSE,
          trace = "none",
          dendrogram = "row",
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
pdf('TNF MK2i targets in TNF NI_V vs I_V vs I_MK2i RNASeq.pdf',
    width = 9,
    height = 11)
heatmap.2(heatmap_matrix,
          main = "TNF MK2i targets\nTNF I_V vs NI_V vs I_MK2i\npadj < 0.1 LFC > 0.25",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          lwid = c(1,5),
          col=my_palette,
          cexCol = 1,
          margins = c(8,3),
          labRow = FALSE,
          trace = "none",
          dendrogram = "row",
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
include_graphics('TNF MK2i targets in TNF NI_V vs I_V vs I_MK2i RNASeq.png')

TCT

# rearrange the TCT master count matrix
rawCountTable_transform_detected_TCT <- read_csv("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TCT/VST_TCT.csv", )
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   X1 = col_character()
## )
## See spec(...) for full column specifications.
row_names <- rawCountTable_transform_detected_TCT$X1
rawCountTable_transform_detected_TCT$X1 <- NULL
rawCountTable_transform_detected_reshape <- cbind(rawCountTable_transform_detected_TCT[,19:23], rawCountTable_transform_detected_TCT[,7:12], rawCountTable_transform_detected_TCT[,1:6])
rownames(rawCountTable_transform_detected_reshape) <- row_names

# plot the heatmap
dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.1 & abs(dif_analysis$log2FoldChange) > 0.25)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
  sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected_reshape)))
}
sig_count <- rawCountTable_transform_detected_reshape[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
  sig_dif[i,7:23] <- zscore(as.numeric(sig_dif[i,7:23]))
}

heatmap_matrix <- as.matrix(sig_dif[,7:23])

png('TNF MK2i targets in TCT NI_V vs I_V vs I_MK2i RNASeq.png',
    width = 600,
    height = 700,
    res = 100,
    pointsize = 8)
heatmap.2(heatmap_matrix,
          main = "TNF MK2i targets\nTCT I_V vs NI_V vs I_MK2i\npadj < 0.1 LFC > 0.25",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          lwid = c(1,5),
          col=my_palette,
          cexCol = 1,
          margins = c(8,3),
          labRow = FALSE,
          trace = "none",
          dendrogram = "row",
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
pdf('TNF MK2i targets in TCT NI_V vs I_V vs I_MK2i RNASeq.pdf',
    width = 9,
    height = 11)
heatmap.2(heatmap_matrix,
          main = "TNF MK2i targets\nTCT NI_V vs I_V vs I_MK2i\npadj < 0.1 LFC > 0.25",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          lwid = c(1,5),
          col=my_palette,
          cexCol = 1,
          margins = c(8,3),
          labRow = FALSE,
          trace = "none",
          dendrogram = "row",
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
include_graphics('TNF MK2i targets in TCT NI_V vs I_V vs I_MK2i RNASeq.png')

Scatter plot, MA plot and Volcano plot for data visualization

# Scatter plot
detected_pseudocount <- pseudoCount[keep,]
dif_analysis$I_MKI_mean <- rowMeans(detected_pseudocount[,6:11])
dif_analysis$I_V_mean <- rowMeans(detected_pseudocount[,1:5])
(scatter <- ggplot(dif_analysis, aes(x = I_V_mean, y = I_MKI_mean)) +
  xlab("I_V_Average(log2)") + ylab("I_MKI_Average(log2)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
  labs(title = "I_MKI vs I_V Scatter Plot"))

pdf('ScatterPlot_I_MKi vs I_V.pdf')
scatter
dev.off()
## quartz_off_screen 
##                 2
# MA plot
(MA <- ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange,)) +
  xlab("Average Expression") + ylab("LFC") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_MKI vs I_V MA Plot"))

pdf('MAPlot_I_MKi vs I_V.pdf')
MA
dev.off()
## quartz_off_screen 
##                 2
# Volcano Plot
(volcano <- ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(pvalue,10))) +
  xlab("LFC") + ylab("-log10(P value)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_MKI vs I_V Volcano Plot"))
## Warning: Removed 29 rows containing missing values (geom_point).

pdf('VolcanoPlot_I_MKi vs I_V.pdf')
volcano
## Warning: Removed 29 rows containing missing values (geom_point).
dev.off()
## quartz_off_screen 
##                 2

NI_MKI vs NI_V

Compile gene count files in DESeq2

Set working directory to the folder that contains only gene count txt files

# Generate DESeqData using the counting result generated using Salmon
countFiles_icompare <- c(countFiles[17:21], countFiles[12:16])

txi.salmon <- tximport(countFiles_icompare, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable <- DESeqsampletable_all[c(17:21, 12:16), ]
DESeqsampletable$condition <- factor(c(rep('control', 5), rep('experimental', 5)))
DESeqsampletable$batch <- NULL

ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable, ~ condition + gender)
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- DESeq(ddsHTSeq)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis)
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895

MA plot was generated to inspect the correct shrinkage of LFC.

DESeq2::plotMA(ddsHTSeq_analysis)

Clustering of the sample-to-sample distances

This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf

rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TNF")
png('Hierchical_Clustering_NI_MKI vs NI_V.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2
pdf('Hierchical_Clustering_NI_MKI vs NI_V.pdf')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2
include_graphics('Hierchical_Clustering_NI_MKI vs NI_V.png')

Principal component plot of the samples

I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.

The top 500 most variable genes are selected for PCA analysis.

(PCA <- plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500))

pdf('PCA_TNF_NIMKi-vs-NIV.pdf',
    width = 8,
    height = 6)
PCA
dev.off()
## quartz_off_screen 
##                 2

Differential analysis

Raw data filtering and Generate the raw count file with all detected genes

This step removes all genes with less than 50 counts in average across all control or experimental samples, output a csv file and also generate a density plot using filtered dataset.

keep <- c()
for (i in 1:dim(rawCountTable)[1]) {
  if (sum(rawCountTable[i,1:5] != 0) >=2 | sum(rawCountTable[i,6:10] != 0) >= 2) {
    keep <- c(keep, i)
  }
}
filterCount <- pseudoCount[keep,]
df <- melt(filterCount, variable_name = "Samples")
## Using  as id variables
df <- data.frame(df, Condition = substr(df$Samples,1,4))
detected_raw_count_norm <- rawCountTable[keep,]
write.csv(detected_raw_count_norm, "normalized_raw_gene_counts_NI_MKI vs NI_V.csv")

ggplot(df, aes(x=value, colour = Samples, fill = Samples)) + 
  geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
  theme(legend.position = "top") + xlab("pseudocounts")

Generate file with differential analysis result

This step generates the analysis file that contains results from differential analysis.

write.csv(as.data.frame(ddsHTSeq_analysis[keep,]), "Differential Analysis_NI_MKI vs NI_V.csv")

Draw heatmap for transcripts that are significantly dysregulated in NI_MKI samples compared to NI_V samples

Genes that were not detected were removed from the list. Genes with padj < 0.1 were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.

suppressMessages(library(mosaic))

rawCountTable_transform_detected <- rawCountTable_transform[keep,]
write.csv(rawCountTable_transform_detected, "VST_NI_MKI vs NI_V.csv")


dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.1 & abs(dif_analysis$log2FoldChange) > 0.25)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
  sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected)))
}
sig_count <- rawCountTable_transform_detected[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
  sig_dif[i,7:16] <- zscore(as.numeric(sig_dif[i,7:16]))
}

heatmap_matrix <- as.matrix(sig_dif[,7:16])

png('NI_MKI vs NI_V RNASeq.png',
    width = 600,
    height = 1200,
    res = 200,
    pointsize = 8)
heatmap.2(heatmap_matrix,
          main = "NI_MKI vs NI_V RNASeq\npadj < 0.1\nLFC > 0.25",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
pdf('NI_MKI vs NI_V RNASeq.pdf',
    width = 6,
    height = 10)
heatmap.2(heatmap_matrix,
          main = "NI_MKI vs NI_V RNASeq\npadj < 0.1\nLFC > 0.25",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
include_graphics('NI_MKI vs NI_V RNASeq.png')

# output number of significant DE genes
dim(sig_dif)[1]
## [1] 20
write.csv(sig_dif, "Significant genes_NI_MKI vs NI_V.csv")

Scatter plot, MA plot and Volcano plot for data visualization

# Scatter plot
detected_pseudocount <- pseudoCount[keep,]
dif_analysis$NI_MKI_mean <- rowMeans(detected_pseudocount[,6:10])
dif_analysis$NI_V_mean <- rowMeans(detected_pseudocount[,1:5])
(scatter <- ggplot(dif_analysis, aes(x = NI_V_mean, y = NI_MKI_mean)) +
  xlab("NI_V_Average(log2)") + ylab("NI_MKI_Average(log2)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
  labs(title = "NI_MKI vs NI_V Scatter Plot"))

pdf('ScatterPlot_NI_MKi vs NI_V.pdf')
scatter
dev.off()
## quartz_off_screen 
##                 2
# MA plot
(MA <- ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange,)) +
  xlab("Average Expression") + ylab("LFC") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "NI_MKI vs NI_V MA Plot"))

pdf('MAPlot_NI_MKi vs NI_V.pdf')
MA
dev.off()
## quartz_off_screen 
##                 2
# Volcano Plot
(volcano <- ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(pvalue,10))) +
  xlab("LFC") + ylab("-log10(P value)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "black") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "NI_MKI vs NI_V Volcano Plot"))
## Warning: Removed 53 rows containing missing values (geom_point).

pdf('VolcanoPlot_NI_MKi vs NI_V.pdf')
volcano
## Warning: Removed 53 rows containing missing values (geom_point).
dev.off()
## quartz_off_screen 
##                 2

MKI vs V

Compile gene count files in DESeq2

Set working directory to the folder that contains only gene count txt files

# Generate DESeqData using the counting result generated using Salmon
countFiles_all <- c(countFiles[17:21], countFiles[7:11], countFiles[12:16], countFiles[1:6])

txi.salmon <- tximport(countFiles_all, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable <- DESeqsampletable_all[c(17:21, 7:11, 12:16, 1:6), ]
DESeqsampletable$condition <- factor(c(rep('control', 10), rep('experimental', 11)))

ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable, ~ condition + batch + gender)
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- DESeq(ddsHTSeq)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis)
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895

MA plot was generated to inspect the correct shrinkage of LFC.

DESeq2::plotMA(ddsHTSeq_analysis)

Clustering of the sample-to-sample distances

This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf

rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TNF")
png('Hierchical_Clustering_MKI vs V.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2

Final output is following: Hierchical Clustering

Principal component plot of the samples

I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.

The top 500 most variable genes are selected for PCA analysis.

plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500) 

  #geom_text(aes(label=name), vjust = 2) +
  #xlim(-30, 20) + ylim(-25, 15)

Differential analysis

Raw data filtering and Generate the raw count file with all detected genes

This step removes all genes with less than 50 counts across all control or experimental samples, output a csv file and also generate a density plot using filtered dataset.

keep <- c()
for (i in 1:dim(rawCountTable)[1]) {
  if (sum(rawCountTable[i,1:10] != 0) >=2 | sum(rawCountTable[i,11:21] != 0) >= 2) {
    keep <- c(keep, i)
  }
}

filterCount <- pseudoCount[keep,]
df <- melt(filterCount, variable_name = "Samples")
## Using  as id variables
df <- data.frame(df, Condition = substr(df$Samples,1,4))
detected_raw_count_norm <- rawCountTable[keep,]
write.csv(detected_raw_count_norm, "normalized_raw_gene_counts_MKI vs V.csv")

ggplot(df, aes(x=value, colour = Samples, fill = Samples)) + 
  geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
  theme(legend.position = "top") + xlab("pseudocounts")

Generate file with differential analysis result

This step generates the analysis file that contains results from differential analysis.

write.csv(as.data.frame(ddsHTSeq_analysis[keep,]), "Differential Analysis_MKI vs V.csv")

Draw heatmap for transcripts that are significantly dysregulated in I_MKI samples compared to I_V samples

Genes that were not detected were removed from the list. Genes with padj < 0.1 (I decided to relax the padj cutoff here as well to keep it consistent with the TCT model analysis) were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.

suppressMessages(library(mosaic))

rawCountTable_transform_detected <- rawCountTable_transform[keep,]
write.csv(rawCountTable_transform_detected, "VST_MKI vs V.csv")


dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.1 & abs(dif_analysis$log2FoldChange) > 0.1)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
  sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected)))
}
sig_count <- rawCountTable_transform_detected[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
  sig_dif[i,7:27] <- zscore(as.numeric(sig_dif[i,7:27]))
}

heatmap_matrix <- as.matrix(sig_dif[,7:27])

png('MKI vs V RNASeq.png',
    width = 600,
    height = 1200,
    res = 200,
    pointsize = 8)
heatmap.2(heatmap_matrix,
          main = "MKI vs V RNASeq\npadj < 0.1\n LFC > 0.1",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "both",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
# output number of significant DE genes
dim(sig_dif)[1]
## [1] 11
write.csv(sig_dif, "Significant genes_MKI vs V.csv")

Final output is Heatmap for differential genes

Scatter plot, MA plot and Volcano plot for data visualization

# Scatter plot
detected_pseudocount <- pseudoCount[keep,]
dif_analysis$MKI_mean <- rowMeans(detected_pseudocount[,11:21])
dif_analysis$V_mean <- rowMeans(detected_pseudocount[,1:10])
ggplot(dif_analysis, aes(x = V_mean, y = MKI_mean)) +
  xlab("V_Average(log2)") + ylab("MKI_Average(log2)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.1), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.1), alpha = 0.5, size = 1, color = "blue") +
  labs(title = "MKI vs V Scatter Plot")

# MA plot
ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange,)) +
  xlab("Average Expression") + ylab("LFC") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.1), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.1), alpha = 0.5, size = 1, color = "blue") +
labs(title = "MKI vs V MA Plot")

# Volcano Plot
ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(pvalue,10))) +
  xlab("LFC") + ylab("-log10(P value)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "black") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.1), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.1), alpha = 0.5, size = 1, color = "blue") +
labs(title = "MKI vs V Volcano Plot")
## Warning: Removed 39 rows containing missing values (geom_point).

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.4
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] mosaic_1.6.0                Matrix_1.2-18              
##  [3] mosaicData_0.17.0           ggformula_0.9.4            
##  [5] ggstance_0.3.4              knitr_1.28                 
##  [7] pathview_1.26.0             org.Hs.eg.db_3.10.0        
##  [9] org.Mm.eg.db_3.10.0         DOSE_3.12.0                
## [11] clusterProfiler_3.14.3      VennDiagram_1.6.20         
## [13] futile.logger_1.4.3         dplyr_0.8.5                
## [15] readr_1.3.1                 RColorBrewer_1.1-2         
## [17] gplots_3.0.3                mixOmics_6.10.9            
## [19] MASS_7.3-51.6               reshape_0.8.8              
## [21] lattice_0.20-41             ggplot2_3.3.0              
## [23] EnsDb.Mmusculus.v79_2.99.0  ensembldb_2.10.2           
## [25] AnnotationFilter_1.10.0     GenomicFeatures_1.38.2     
## [27] AnnotationDbi_1.48.0        gridExtra_2.3              
## [29] tximport_1.14.2             limma_3.42.2               
## [31] DESeq2_1.26.0               SummarizedExperiment_1.16.1
## [33] DelayedArray_0.12.3         BiocParallel_1.20.1        
## [35] matrixStats_0.56.0          Biobase_2.46.0             
## [37] GenomicRanges_1.38.0        GenomeInfoDb_1.22.1        
## [39] IRanges_2.20.2              S4Vectors_0.24.4           
## [41] BiocGenerics_0.32.0        
## 
## loaded via a namespace (and not attached):
##   [1] tidyselect_1.0.0         RSQLite_2.2.0            htmlwidgets_1.5.1       
##   [4] munsell_0.5.0            withr_2.2.0              colorspace_1.4-1        
##   [7] GOSemSim_2.12.1          rstudioapi_0.11          labeling_0.3            
##  [10] KEGGgraph_1.46.0         urltools_1.7.3           GenomeInfoDbData_1.2.2  
##  [13] polyclip_1.10-0          bit64_0.9-7              farver_2.0.3            
##  [16] generics_0.0.2           vctrs_0.2.4              lambda.r_1.2.4          
##  [19] xfun_0.13                BiocFileCache_1.10.2     R6_2.4.1                
##  [22] graphlayouts_0.7.0       locfit_1.5-9.4           bitops_1.0-6            
##  [25] fgsea_1.12.0             gridGraphics_0.5-0       assertthat_0.2.1        
##  [28] scales_1.1.0             ggraph_2.0.2             nnet_7.3-14             
##  [31] enrichplot_1.6.1         gtable_0.3.0             tidygraph_1.1.2         
##  [34] rlang_0.4.5              genefilter_1.68.0        splines_3.6.3           
##  [37] rtracklayer_1.46.0       lazyeval_0.2.2           acepack_1.4.1           
##  [40] broom_0.5.6              mosaicCore_0.6.0         europepmc_0.3           
##  [43] checkmate_2.0.0          BiocManager_1.30.10      yaml_2.2.1              
##  [46] reshape2_1.4.4           crosstalk_1.1.0.1        backports_1.1.6         
##  [49] qvalue_2.18.0            Hmisc_4.4-0              tools_3.6.3             
##  [52] ggplotify_0.0.5          ellipsis_0.3.0           ggdendro_0.1-20         
##  [55] ggridges_0.5.2           Rcpp_1.0.4               plyr_1.8.6              
##  [58] base64enc_0.1-3          progress_1.2.2           zlibbioc_1.32.0         
##  [61] purrr_0.3.4              RCurl_1.98-1.2           prettyunits_1.1.1       
##  [64] rpart_4.1-15             openssl_1.4.1            viridis_0.5.1           
##  [67] cowplot_1.0.0            ggrepel_0.8.2            cluster_2.1.0           
##  [70] magrittr_1.5             data.table_1.12.8        RSpectra_0.16-0         
##  [73] futile.options_1.0.1     DO.db_2.9                triebeard_0.3.0         
##  [76] ProtGenerics_1.18.0      hms_0.5.3                evaluate_0.14           
##  [79] xtable_1.8-4             leaflet_2.0.3            XML_3.99-0.3            
##  [82] jpeg_0.1-8.1             compiler_3.6.3           biomaRt_2.42.1          
##  [85] ellipse_0.4.1            tibble_3.0.1             KernSmooth_2.23-17      
##  [88] crayon_1.3.4             htmltools_0.4.0          corpcor_1.6.9           
##  [91] Formula_1.2-3            tidyr_1.0.2              geneplotter_1.64.0      
##  [94] DBI_1.1.0                tweenr_1.0.1             formatR_1.7             
##  [97] dbplyr_1.4.3             rappdirs_0.3.1           gdata_2.18.0            
## [100] igraph_1.2.5             pkgconfig_2.0.3          rvcheck_0.1.8           
## [103] GenomicAlignments_1.22.1 foreign_0.8-76           xml2_1.3.2              
## [106] rARPACK_0.11-0           annotate_1.64.0          XVector_0.26.0          
## [109] stringr_1.4.0            digest_0.6.25            graph_1.64.0            
## [112] Biostrings_2.54.0        rmarkdown_2.1            fastmatch_1.1-0         
## [115] htmlTable_1.13.3         curl_4.3                 Rsamtools_2.2.3         
## [118] gtools_3.8.2             nlme_3.1-147             lifecycle_0.2.0         
## [121] jsonlite_1.6.1           viridisLite_0.3.0        askpass_1.1             
## [124] pillar_1.4.3             KEGGREST_1.26.1          httr_1.4.1              
## [127] survival_3.1-12          GO.db_3.10.0             glue_1.4.0              
## [130] png_0.1-7                bit_1.1-15.2             Rgraphviz_2.30.0        
## [133] ggforce_0.3.1            stringi_1.4.6            blob_1.2.1              
## [136] latticeExtra_0.6-29      caTools_1.18.0           memoise_1.1.0